Introduction

This section will focus on: 1. How to engineer a graph plot: + Igraph’s Plot + ggraph 2. Useful tips on how to effectively use visualization for graph analysis: + Filter + Aggregate + Query

library(tidyverse)
library(ggraph)
library(igraph)
library(igraphdata)

Base Plot

Before we try to tackle a big dataset, let’s try to plot a graph that we know has structure.

set.seed(4321)
tree_g <- make_tree(40, 4) %>%
  set_edge_attr('weight', value = sample(1:3, ecount(.), replace = T)) %>%
  set_vertex_attr('degree', value = degree(.))
set.seed(1234)
plot(tree_g)

set.seed(1234)
plot(tree_g,
     vertex.size = 5,
     vertex.label = 'node',
     vertex.label.cex = .75,
     edge.width = 2,
     edge.arrow.size = .5)

set.seed(1234)
plot(tree_g,
     vertex.size = V(tree_g)$degree * 3,
     vertex.label = ifelse(1:vcount(tree_g) %% 2, 'node', 'vertex'),
     vertex.label.cex = sample(c(.5, 1), vcount(tree_g), replace = T),
     edge.width = E(tree_g)$weight,
     edge.arrow.size = .5)

set.seed(1234)
plot(tree_g,
     vertex.label = NA,
     vertex.size = 10,
     vertex.color = 'hotpink',
     edge.color = 'midnightblue',
     edge.arrow.size = .5)

set.seed(1234)
plot(tree_g,
     vertex.label = NA,
     vertex.size = 10,
     vertex.color = map_chr(V(tree_g)$degree, function(x){switch(as.character(x), "4" = 'red', "5" = 'white', "1" = 'blue')}),
     edge.color = map_chr(E(tree_g)$weight, function(x){rgb(0, 0, 1, x/3)}),
     edge.arrow.size = .5)

V(tree_g)$color = map_chr(V(tree_g)$degree, function(x){switch(as.character(x), "4" = 'red', "5" = 'white', "1" = 'blue')})
V(tree_g)$size = V(tree_g)$degree * 4
V(tree_g)$label = NA
E(tree_g)$color = map_chr(E(tree_g)$weight, function(x){rgb(0, 0, 1, x/3)})
E(tree_g)$width = E(tree_g)$weight
E(tree_g)$arrow.size = .3
set.seed(1234)
plot(tree_g)

plot(tree_g, layout = layout_as_tree)

plot(tree_g, layout = layout_as_tree(tree_g, circular = T))

If you want to create your own matrix, then the first column of the matrix is the horizontal location (x) and the second column of the matrix is the vertical location (y). The default of igraph.plot is to make the x location dependent on the y location. If you do not want this behaviour, then change the aspect ration asp parameter to 0.

myLayout = cbind(c(vcount(tree_g)/2, 
                    1:4, 
                    map_dbl(1:16, function(x){vcount(tree_g) - x}),
                    map_dbl(1:19, function(x){vcount(tree_g)/2 - x})),
                 c(1, rep(vcount(tree_g)/4, 4), rep(vcount(tree_g)/2, 16), rep(vcount(tree_g), 19)))
plot(tree_g, layout = myLayout, asp = 0)
axis(1, at = c(-1, 0, 1), labels = c('0', 'vcount/2', 'vcount'))
axis(2, at = c(-1, 0, 1), labels = c('0', 'vcount/2', 'vcount'))

ggraph

set.seed(1234)
ggraph(tree_g) +
  geom_edge_fan() +
  geom_node_point() +
  theme_void()

set.seed(1234)
ggraph(tree_g) +
  geom_edge_fan(colour = 'blue',
                arrow = arrow()) +
  geom_node_point(size = 5, 
                  shape = 21, 
                  fill = 'rosybrown') +
  theme_void()
## Using `nicely` as default layout

set.seed(1234)
ggraph(tree_g) +
  geom_edge_fan(colour = 'blue',
                arrow = arrow(length = unit(3, 'mm')),
                start_cap = circle(3, 'mm'),
                end_cap = circle(3, 'mm')) +
  geom_node_point(size = 5, 
                  shape = 21, 
                  fill = 'rosybrown') +
  theme_void()
## Using `nicely` as default layout

set.seed(1234)
ggraph(tree_g) +
  geom_edge_fan(aes(color = weight, 
                    width = weight),
                arrow = arrow(length = unit(3, 'mm')),
                start_cap = circle(3, 'mm'),
                end_cap = circle(3, 'mm')) +
  geom_node_point(aes(size = degree, 
                      fill = as.character(degree)), 
                  shape = 21) +
  theme_void()

set.seed(1234)
ggraph(tree_g) +
  geom_edge_fan(aes(color = weight, 
                    width = weight),
                arrow = arrow(length = unit(3, 'mm')),
                start_cap = circle(3, 'mm'),
                end_cap = circle(3, 'mm')) +
  geom_node_point(aes(size = degree, 
                      fill = as.character(degree)), 
                  shape = 21) +
  theme_void() +
  scale_edge_color_continuous(low = '#ece7f2', high = '#2b8cbe') +
  scale_edge_width(range = c(.5, 1.5)) +
  scale_size_area(max_size = 10) +
  scale_fill_manual(values = c( "4" = 'red', "5" = 'white', "1" = 'blue'))

set.seed(1234)
ggraph(tree_g, layout = 'tree') +
  geom_edge_fan(aes(color = weight, 
                    width = weight),
                arrow = arrow(length = unit(3, 'mm')),
                start_cap = circle(3, 'mm'),
                end_cap = circle(3, 'mm')) +
  geom_node_point(aes(size = degree, 
                      fill = as.character(degree)), 
                  shape = 21) +
  theme_void() +
  scale_edge_color_continuous(low = '#ece7f2', high = '#2b8cbe') +
  scale_edge_width(range = c(.5, 1.5)) +
  scale_size_area(max_size = 10) +
  scale_fill_manual(values = c( "4" = 'red', "5" = 'white', "1" = 'blue'))

set.seed(1234)
ggraph(tree_g, layout = 'tree', circular = T) +
  geom_edge_fan(aes(color = weight, 
                    width = weight),
                arrow = arrow(length = unit(3, 'mm')),
                start_cap = circle(3, 'mm'),
                end_cap = circle(3, 'mm')) +
  geom_node_point(aes(size = degree, 
                      fill = as.character(degree)), 
                  shape = 21) +
  theme_void() +
  scale_edge_color_continuous(low = '#ece7f2', high = '#2b8cbe') +
  scale_edge_width(range = c(.5, 1.5)) +
  scale_size_area(max_size = 10) +
  scale_fill_manual(values = c( "4" = 'red', "5" = 'white', "1" = 'blue'))

set.seed(1234)
ggraph(tree_g, layout = 'manual', node.positions = data.frame(x = myLayout[,1], y = myLayout[,2])) +
  geom_edge_fan(aes(color = weight, 
                    width = weight),
                arrow = arrow(length = unit(3, 'mm')),
                start_cap = circle(3, 'mm'),
                end_cap = circle(3, 'mm')) +
  geom_node_point(aes(size = degree, 
                      fill = as.character(degree)), 
                  shape = 21) +
  theme_void() +
  scale_edge_color_continuous(low = '#ece7f2', high = '#2b8cbe') +
  scale_edge_width(range = c(.5, 1.5)) +
  scale_size_area(max_size = 10) +
  scale_fill_manual(values = c( "4" = 'red', "5" = 'white', "1" = 'blue'))

data(enron)
enron
## IGRAPH 64ec693 D--- 184 125409 -- Enron email network
## + attr: LDC_names (g/c), LDC_desc (g/c), name (g/c), Citation
## | (g/c), Email (v/c), Name (v/c), Note (v/c), Time (e/c),
## | Reciptype (e/c), Topic (e/n), LDC_topic (e/n)
## + edges from 64ec693:
##  [1]  25->154  25->154  30-> 30  30-> 30  30-> 30  30-> 30  39-> 39
##  [8]  52-> 67  52-> 67  52-> 67  52-> 67  61->100  61->100  61->163
## [15]  61->163  61->166  61->166  61->170  64-> 59  64-> 59  64-> 64
## [22]  64-> 64  64->147  64->147  64->164  64->164  64->168  66-> 66
## [29]  66-> 66  67->129  67->129  67->129  67->129  93-> 10  93-> 10
## [36]  93-> 10  93-> 10  93-> 39  93-> 39  93-> 93  93-> 93  93-> 93
## + ... omitted several edges
graph_attr(enron) %>%
  lapply(head)
## $LDC_names
## [1] "Calif_analysis"     "Calif_bankruptcy"   "Calif_utilities"   
## [4] "Calif_crisis_legal" "Calif_enron"        "Calif_federal"     
## 
## $LDC_desc
## [1] "Executive summaries and analyses about the California situation. (304 entries)"                                                                                     
## [2] "Specifically mentioned financial difficulties of the utilities such as Southern California Edison (SoCal Edison) and Pacific Gas & Electric (PG & E). (36 entries)" 
## [3] "General references to California utility companies: Edison, Pacific Gas & Electric, and the California Public Utility Commission (CPUC) which regulates them. (116)"
## [4] "Articles about legal issues surrounding California energy crisis.  (109)"                                                                                           
## [5] "Enron business emails about the day to day operations of managing the California side of their business. (699)"                                                     
## [6] "Emails about FERC (Federal Energy Regulatory Commission), U.S.  Senate Hearings. (61)"                                                                              
## 
## $name
## [1] "Enron email network"
## 
## $Citation
## [1] "C.E. Priebe, J.M. Conroy, D.J. Marchette, and Y. Park, \"Scan Statistics on Enron Graphs,\" Computational and Mathematical Organization Theory, Volume 11, Number 3, p229 - 247, October 2005, Springer Science+Business Media B.V."
vertex_attr(enron) %>% 
  lapply(head)
## $Email
## [1] "albert.meyers" "a..martin"     "andrea.ring"   "andrew.lewis" 
## [5] "andy.zipper"   "a..shankman"  
## 
## $Name
## [1] "Albert Meyers"    "Thomas Martin"    "Andrea Ring"     
## [4] "Andrew Lewis"     "Andy Zipper"      "Jeffrey Shankman"
## 
## $Note
## [1] "Employee, Specialist"         "Vice President"              
## [3] "NA"                           "Director"                    
## [5] "Vice President, Enron Online" "President, Enron Global Mkts"

There’s no name attribute (it has to be lower case), so let’s make one now. Remember, the name attribute has to be unique for each node.

vertex_attr(enron) %>%
  lapply(function(x){x %>% unique %>% length})
## $Email
## [1] 184
## 
## $Name
## [1] 126
## 
## $Note
## [1] 42

So it seems that the Email attribute is the best candidate for the name because the number of unique Emails matches the number of unique nodes (184)

V(enron)$name <- V(enron)$Email
edge_attr(enron) %>%
  lapply(head)
## $Time
## [1] "1979-12-31 21:00:00" "1979-12-31 21:00:00" "1979-12-31 21:00:00"
## [4] "1979-12-31 21:00:00" "1979-12-31 21:00:00" "1979-12-31 21:00:00"
## 
## $Reciptype
## [1] "to"  "to"  "cc"  "cc"  "bcc" "bcc"
## 
## $Topic
## [1] 1 1 3 3 3 3
## 
## $LDC_topic
## [1]  0 -1 -1 -1 -1 -1
set.seed(1234)
ggraph(enron) +
  geom_edge_fan(alpha = .1) +
  geom_node_point()
## Using `nicely` as default layout

Filter

Let’s take a look at the the years really fast

E(enron)$Time %>%
  str_extract('\\d{4}\\-\\d{2}') %>%
  unique %>%
  sort
##  [1] "1979-12" "1998-11" "1998-12" "1999-01" "1999-02" "1999-03" "1999-04"
##  [8] "1999-05" "1999-06" "1999-07" "1999-08" "1999-09" "1999-10" "1999-11"
## [15] "1999-12" "2000-01" "2000-02" "2000-03" "2000-04" "2000-05" "2000-06"
## [22] "2000-07" "2000-08" "2000-09" "2000-10" "2000-11" "2000-12" "2001-01"
## [29] "2001-02" "2001-03" "2001-04" "2001-05" "2001-06" "2001-07" "2001-08"
## [36] "2001-09" "2001-10" "2001-11" "2001-12" "2002-01" "2002-02" "2002-03"
## [43] "2002-04" "2002-05" "2002-06"

Did you know that Enron was founded in 1985? I think this might be something looking into. Let’s filter the graph down to only include edges from 1979.

old_enron <- enron - E(enron)[!str_detect(E(enron)$Time, '^1979')]
old_enron
## IGRAPH 2a6f2a7 DN-- 184 174 -- Enron email network
## + attr: LDC_names (g/c), LDC_desc (g/c), name (g/c), Citation
## | (g/c), Email (v/c), Name (v/c), Note (v/c), name (v/c), Time
## | (e/c), Reciptype (e/c), Topic (e/n), LDC_topic (e/n)
## + edges from 2a6f2a7 (vertex names):
##  [1] daren.farmer     ->sally.beck       
##  [2] daren.farmer     ->sally.beck       
##  [3] debra.perlingiere->debra.perlingiere
##  [4] debra.perlingiere->debra.perlingiere
##  [5] debra.perlingiere->debra.perlingiere
##  [6] debra.perlingiere->debra.perlingiere
## + ... omitted several edges
ggraph(old_enron) + 
  geom_edge_fan(alpha = .1) +
  geom_node_point()
## Using `nicely` as default layout

Aggregate

Query